# -*- coding: utf-8 -*-

"""
CCCF optimized settings file.

Tuned to resolve concurrent-download bottlenecks and improve crawl
throughput for the fire_control_spider project.
"""

import os  # NOTE(review): `os` appears unused in this file — confirm before removing
from pathlib import Path

# Scrapy settings for fire_control_spider project

BOT_NAME = 'fire_control_spider'

# Spider discovery is restricted to the beijingfire site package.
SPIDER_MODULES = ['fire_control_spider.sites.beijingfire']
NEWSPIDER_MODULE = 'fire_control_spider.sites.beijingfire'

# robots.txt is deliberately IGNORED (set to True to obey it).
ROBOTSTXT_OBEY = False

# ==================== Concurrency tuning ====================
# Total concurrent requests processed by the downloader.
CONCURRENT_REQUESTS = 16

# Base delay between requests — kept low to favour concurrency.
# (AutoThrottle below effectively overrides this once enabled.)
DOWNLOAD_DELAY = 0.1

# Per-domain / per-IP concurrency caps.
# NOTE: when CONCURRENT_REQUESTS_PER_IP is non-zero, Scrapy applies the
# per-IP limit and ignores the per-domain one.
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 8

# Randomize the effective delay (0.5x–1.5x of DOWNLOAD_DELAY) to look
# less like a fixed-rate bot.
RANDOMIZE_DOWNLOAD_DELAY = True

# ==================== AutoThrottle tuning ====================
# AutoThrottle adapts the delay to server latency at runtime.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 3
# Key change: raised target concurrency (matches the per-domain cap of 8).
AUTOTHROTTLE_TARGET_CONCURRENCY = 8.0
AUTOTHROTTLE_DEBUG = False

# ==================== Network settings ====================
# Per-request download timeout, in seconds.
DOWNLOAD_TIMEOUT = 30

# Retry transient failures up to 3 times on these HTTP status codes.
RETRY_TIMES = 3
RETRY_HTTP_CODES = [500, 502, 503, 504, 408, 429]

# Cookies disabled for performance — assumes the target site needs no
# session state; confirm against the site's behaviour.
COOKIES_ENABLED = False

# ==================== Default request headers ====================
# Sent with every request; the fixed Chrome User-Agent may be overridden
# by the custom UserAgentMiddleware enabled below.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Connection': 'keep-alive',  # reuse TCP connections between requests
    'Cache-Control': 'no-cache',  # ask origin servers for fresh content
}

# ==================== Middleware settings ====================
SPIDER_MIDDLEWARES = {
    'fire_control_spider.middlewares.SpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    'fire_control_spider.middlewares.ProxyMiddleware': 350,
    'fire_control_spider.middlewares.UserAgentMiddleware': 400,
    # NOTE(review): Scrapy's builtin RetryMiddleware (priority 550) is not
    # disabled here, so it runs in addition to this custom one — this may
    # double the effective retries; confirm against the middleware code.
    'fire_control_spider.middlewares.RetryMiddleware': 500,
    # Concurrency monitoring middleware.
    'fire_control_spider.middlewares.ConcurrencyMonitorMiddleware': 600,
}

# ==================== Item pipelines ====================
# Lower numbers run first: validate -> dedupe -> download media -> write JSONL.
ITEM_PIPELINES = {
    'fire_control_spider.pipelines.ValidationPipeline': 100,
    'fire_control_spider.pipelines.DuplicatesPipeline': 200,
    'fire_control_spider.pipelines.MediaDownloadPipeline': 300,
    'fire_control_spider.pipelines.JsonlWriterPipeline': 400,
}

# ==================== HTTP cache settings ====================
# Cache responses on disk for 1 hour to avoid re-fetching unchanged pages.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 3600
HTTPCACHE_DIR = 'httpcache'
# Responses with these status codes are never stored in the cache.
HTTPCACHE_IGNORE_HTTP_CODES = [503, 504, 505, 500, 403, 404, 408, 429]

# ==================== Output settings ====================
# Custom project settings read by the output pipelines (not Scrapy builtins).
OUTPUT_DIR = Path("outputs")
JSONL_MAX_SIZE = 1024 * 1024 * 1024  # rotate JSONL file at 1 GB
JSONL_MAX_RECORDS = 50000  # or at 50k records, whichever comes first
IMAGE_MAX_COUNT = 10000  # max 10k images per image directory
CSV_MAX_RECORDS = 100000  # max 100k rows per CSV file

# ==================== Logging settings ====================
# INFO level keeps enough detail for debugging without flooding the log.
LOG_LEVEL = 'INFO'
LOG_ENABLED = True

# Disable the Telnet console (also disabled via EXTENSIONS below).
TELNETCONSOLE_ENABLED = False

# ==================== Extension settings ====================
# Mapping an extension to None DISABLES it — every entry below is off,
# including core stats and log-stats reporting.
EXTENSIONS = {
    'scrapy.extensions.telnet.TelnetConsole': None,
    'scrapy.extensions.logstats.LogStats': None,
    'scrapy.extensions.corestats.CoreStats': None,
    'scrapy.extensions.memusage.MemoryUsage': None,
    'scrapy.extensions.memdebug.MemoryDebugger': None,
    'scrapy.extensions.closespider.CloseSpider': None,
    'scrapy.extensions.feedexport.FeedExporter': None,
    # NOTE(review): None also disables the project's own StatsExtension,
    # which seems to contradict the monitoring flags below — if it should
    # run, this needs an integer order (e.g. 500); confirm with the author.
    'fire_control_spider.extensions.StatsExtension': None,
}

# ==================== Performance monitoring ====================
# Custom project flags — presumably consumed by the project's own
# middlewares/extensions, not by Scrapy itself; verify against those modules.
ENABLE_PERFORMANCE_MONITORING = True
PERFORMANCE_LOG_INTERVAL = 60  # log performance data every 60 s

# ==================== Concurrency monitoring ====================
ENABLE_CONCURRENCY_MONITORING = True
CONCURRENCY_LOG_INTERVAL = 30  # log concurrency data every 30 s

# ==================== Memory limits ====================
# NOTE(review): Scrapy's builtin equivalents are MEMUSAGE_* (and the
# MemoryUsage extension is disabled above) — these custom names only take
# effect if project code reads them; confirm.
MEMORY_LIMIT_MB = 1024  # 1 GB memory cap
MEMORY_CHECK_INTERVAL = 30  # check memory usage every 30 s

# ==================== Debug settings ====================
DEBUG_MODE = True

# Log requests slower than the threshold.
LOG_SLOW_REQUESTS = True
SLOW_REQUEST_THRESHOLD = 5.0  # seconds

# Log BeautifulSoup parses slower than the threshold.
LOG_BS_PARSING_TIME = True
BS_PARSING_THRESHOLD = 1.0  # seconds